This post includes notes that I took from watching a presentation given by Eduardo Ariño de la Rubia, the Chief Data Scientist at Domino Data Lab. In his presentation, he introduces 23 visualizations and the appropriate scenarios to use them. You can find the presentation here, Video - 23 Visualizations and when to use them. My goal is to familiarize myself with these plots by writing the code using mainly ggplot and datasets that I am familiar with.

Load Libraries

# Packages used throughout: tidyverse for wrangling and plotting,
# lubridate for date parts, readxl for reading the Excel workbook.
library(tidyverse)
library(lubridate)
library(readxl)

# Use the classic ggplot2 theme for every plot that follows.
theme_set(theme_classic())

# Bias number printing towards fixed rather than scientific notation.
options(scipen = 999)

Loading Global Superstore Dataset

# Read the first worksheet of the Superstore workbook; the first row holds
# the column names.
global <- read_excel(
  path = "global_superstore.xls",
  sheet = 1,
  col_names = TRUE
)

Data Munging

# Rename the spreadsheet headers to snake_case identifiers so they can be
# used without backticks in the rest of the notebook.
global <- global %>%
  rename(
    row_id = `Row ID`,
    order_id = `Order ID`,
    order_date = `Order Date`,
    ship_date = `Ship Date`,
    ship_mode = `Ship Mode`,
    customer_id = `Customer ID`,
    customer_name = `Customer Name`,
    segment = Segment,
    city = City,
    state = State,
    country = Country,
    postal_code = `Postal Code`,
    market = Market,
    region = Region,
    product_id = `Product ID`,
    category = Category,
    sub_category = `Sub-Category`,
    product_name = `Product Name`,
    sales = Sales,
    quantity = Quantity,
    discount = Discount,
    profit = Profit,
    shipping_cost = `Shipping Cost`,
    order_priority = `Order Priority`
  )

Adding new date columns based on weekday, day, month and year.

# Derive calendar parts of the order date (labelled weekday, day of month,
# labelled month, year) for later filtering and grouping.
global <- global %>%
  mutate(
    weekday = wday(order_date, label = TRUE),
    day = day(order_date),
    month = month(order_date, label = TRUE),
    year = year(order_date)
  )

# Show the augmented table.
global

Deviation

Deviation can be used to emphasize variation from a fixed reference point such as 0, average or any target.

# Tag every order as a gain or a loss relative to zero profit, keep only the
# columns the deviation plots use, and sort from lowest to highest profit.
diverging_profit <- global %>%
  mutate(gain_loss = if_else(profit < 0, "loss", "gain")) %>%
  select(order_date, region, profit, gain_loss) %>%
  arrange(profit)

# Horizontal bars, one per region, stacked from the individual order profits
# and coloured by gain/loss.
ggplot(diverging_profit, aes(x = region, y = profit, label = profit)) +
  geom_col(aes(fill = gain_loss), width = 0.8) +
  coord_flip() +
  scale_fill_manual(values = c("gain" = "#1F77B4", "loss" = "#FF7F0E")) +
  labs(
    title = "Diverging Bar",
    subtitle = "Profit loss or gain from a fixed reference point of 0"
  )

# One semi-transparent point per order: profit on the (flipped) value axis,
# region on the category axis, colour showing gain or loss.
ggplot(diverging_profit, aes(x = region, y = profit, label = profit)) +
  geom_point(aes(colour = gain_loss), size = 2, alpha = 0.6) +
  coord_flip() +
  scale_colour_manual(values = c("gain" = "#1F77B4", "loss" = "#FF7F0E")) +
  labs(
    title = "Diverging Dot Plot",
    subtitle = "Profit loss or gain from a fixed reference point of 0"
  )

# Orders placed before 1 June 2011, each tagged as a gain or a loss.
# NOTE: despite its name, `filter_2012` holds early-2011 orders; the name is
# kept unchanged so anything else referring to it keeps working.
filter_2012 <- global %>%
  # Same rule as `diverging_profit`: only strictly negative profit is a loss,
  # so a zero-profit order is classified identically in every deviation plot.
  mutate(gain_loss = if_else(profit < 0, "loss", "gain")) %>%
  # Compare calendar dates instead of a date-time against a bare string: the
  # string would be parsed in the session's local time zone, which shifts the
  # cutoff depending on where the notebook is run.
  filter(as_date(order_date) < ymd("2011-06-01"))

# Area chart of profit over time, filled by gain/loss.
ggplot(filter_2012, aes(x = order_date, y = profit, fill = gain_loss)) +
  geom_area() +
  scale_fill_manual(values = c("gain" = "#1F77B4", "loss" = "#FF7F0E")) +
  labs(
    title = "Diverging Area Chart",
    subtitle = "Profit loss or gain from a fixed reference point of 0"
  )

Correlation

Correlation shows the relationship between two or more variables.

# Sales against profit, points coloured by customer segment, with a single
# LOESS trend line (no confidence band) fitted across all segments.
ggplot(global, aes(x = sales, y = profit)) +
  geom_point(aes(colour = segment), alpha = 0.6) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_colour_manual(values = c("#1F77B4", "#FF7F0E", "#2CA02C")) +
  labs(
    title = "Scatterplot with Smoothing Line Based on LOESS",
    subtitle = "Sales vs Profit"
  )

Ranking

Ranking is useful where an ordered list is important.

# Total sales per region, with `region` turned into a factor whose levels
# run from the highest-selling region to the lowest so the bars are ranked.
ranked_region <- global %>%
  group_by(region) %>%
  summarise(sum = sum(sales)) %>%
  mutate(region = factor(region, levels = region[order(desc(sum))]))

ggplot(ranked_region, aes(x = region, y = sum)) +
  geom_col(width = 0.8, fill = "#1F77B4") +
  labs(title = "Ranked Bar Chart", subtitle = "Sales by Region") +
  # Tilt the region names so they do not overlap.
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

NA
# One dot per region at its total sales, plus a thin dashed guide line per
# region spanning the smallest to the largest total; axes are flipped so the
# regions read top to bottom.
ggplot(ranked_region, aes(x = region, y = sum)) +
  geom_point(colour = "#1F77B4", size = 3, alpha = 0.9) +
  geom_segment(
    aes(x = region, xend = region, y = min(sum), yend = max(sum)),
    linetype = "dashed",
    size = 0.1
  ) +
  coord_flip() +
  labs(title = "Dot Plot Ranking Bar", subtitle = "Sales vs Region")

Distribution

Distribution plots show how often values of a variable occur.

# Histogram of highway mileage from the mpg dataset, split into 7 bins.
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(bins = 7, colour = "black", fill = "#1F77B4") +
  labs(
    title = "Histogram",
    subtitle = "Count of Highway Miles"
  )

Data munging

# Treat the cylinder count as a categorical variable, then show the data.
mtcars[["cyl"]] <- as.factor(mtcars[["cyl"]])
mtcars

# Density of car weight, one filled curve per cylinder count.
ggplot(mtcars, aes(x = wt)) +
  geom_density(aes(fill = factor(cyl)), alpha = 0.7) +
  xlim(0, 6) +
  scale_fill_manual(values = c("#1F77B4", "#2CA02C", "#FF7F0E")) +
  labs(
    title = "Density plot",
    subtitle = "Weight (1000 lbs) per Cylinder",
    caption = "Source: mtcars",
    x = "Weight",
    fill = "Cylinders"
  )

# Quarter-mile time by cylinder count; box width reflects group size.
ggplot(mtcars, aes(x = cyl, y = qsec, group = cyl)) +
  geom_boxplot(fill = "#1F77B4", varwidth = TRUE) +
  labs(
    title = "Boxplot",
    subtitle = "1/4 mile time per Cylinder",
    caption = "Source: mtcars",
    x = "Cylinders"
  )

Composition

Composition graphs show how a single entity can be broken down into its component elements.

# Number of order lines per sub-category, with each bar filled by the
# category it belongs to.
ggplot(global, aes(x = sub_category)) +
  geom_bar(aes(fill = category)) +
  scale_fill_manual(values = c("#1F77B4", "#FF7F0E", "#2CA02C")) +
  labs(
    title = "Composition",
    subtitle = "Counts of Sub-Category - Grouped by Category",
    caption = "Source: Superstore",
    x = "Sub-Category",
    fill = "Category"
  ) +
  # Tilt the sub-category names so they do not overlap.
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

NA
# Show the current state of the Superstore table.
global

Stacked Column

# Number of order lines per market, stacked by product category.
ggplot(global, aes(x = market)) +
  geom_bar(aes(fill = category)) +
  scale_fill_manual(values = c("#1F77B4", "#FF7F0E", "#2CA02C")) +
  labs(
    title = "Category Stacked - Bar Chart",
    subtitle = "Markets - Grouped by Category",
    x = "Market",
    fill = "Category",
    caption = "Source: Superstore"
  ) +
  # Tilt the market names so they do not overlap.
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

# Treemap of the January 2012 orders: tile area is sales, tile colour is
# profit, and tiles are grouped by market.
library(treemapify)

global_treemap <- global %>%
  filter(year == 2012, month == "Jan") %>%
  mutate(
    country = as.factor(country),
    region = as.factor(region),
    market = as.factor(market)
  )

# Tile coordinates, kept under the original name for anything that inspects
# them. The `fill` and `group` arguments are deprecated in treemapify();
# grouping is now expressed with `subgroup`.
treeMapCoordinates <- treemapify(
  global_treemap,
  area = "sales",
  subgroup = "market"
)

# `ggplotify()` is no longer part of treemapify; treemaps are drawn with
# ggplot2 geoms, and `geom_treemap()` computes the layout itself.
treeMapPlot <- ggplot(
  global_treemap,
  aes(area = sales, fill = profit, subgroup = market)
) +
  geom_treemap() +
  geom_treemap_subgroup_border() +
  geom_treemap_subgroup_text(place = "centre", colour = "white", alpha = 0.7)
print(treeMapPlot)

Change

Change gives emphasis to changing trends.

# Keep only the orders placed in 2012.
year_2012 <- filter(global, year == 2012)

# Sales over time, one panel per customer segment stacked in three rows.
ggplot(year_2012, aes(x = order_date, y = sales)) +
  geom_line(colour = "#1F77B4") +
  facet_wrap(~segment, nrow = 3) +
  labs(
    title = "Change Trends",
    subtitle = "Sales by segment",
    x = "Date",
    caption = "Source: Superstore"
  )

Heatmap - with heatmaps we can spot variations of a metric.

library(superheat)

# `mtcars_matrix` was never created in this notebook. Build it from the
# pristine `datasets::mtcars`: the working copy of `mtcars` has `cyl`
# converted to a factor earlier on, which would make `as.matrix()` return a
# character matrix that cannot be drawn as a heatmap.
mtcars_matrix <- as.matrix(datasets::mtcars)

superheat(
  mtcars_matrix,
  # Centre and scale each column so variables measured in very different
  # units (e.g. disp vs. wt) share one meaningful colour scale.
  scale = TRUE,
  left.label.size = 0.3,
  left.label.text.size = 3,
  legend.text.size = 12,
  padding = 0.1
)


Video - 23 Visualizations and when to use them

---
title: 'Visualization: When to use them!'
output:
  html_notebook: default
  pdf_document: default
---

This post includes notes that I took from watching a presentation given by Eduardo Ariño de la Rubia, the Chief Data Scientist at Domino Data Lab.  In his presentation, he introduces 23 visualizations and the appropriate scenarios to use them.
You can find the presentation here, [Video- 23 Visualizations and when to use them](https://blog.dominodatalab.com/video-23-visualizations-use/). My goal is to familiarize myself with these plots by writing the code using mainly ggplot and datasets that I am familiar with.

### Load Libraries
```{r}
# Packages used throughout: tidyverse for wrangling and plotting,
# lubridate for date parts, readxl for reading the Excel workbook.
library(tidyverse)
library(lubridate)
library(readxl)

# Use the classic ggplot2 theme for every plot that follows.
theme_set(theme_classic())

# Bias number printing towards fixed rather than scientific notation.
options(scipen = 999)
```
#### Loading Global Superstore Dataset
```{r}
# Read the first worksheet of the Superstore workbook; the first row holds
# the column names.
global <- read_excel(
  path = "global_superstore.xls",
  sheet = 1,
  col_names = TRUE
)
```

#### Data Munging
```{r}
# Rename the spreadsheet headers to snake_case identifiers so they can be
# used without backticks in the rest of the notebook.
global <- global %>%
  rename(
    row_id = `Row ID`,
    order_id = `Order ID`,
    order_date = `Order Date`,
    ship_date = `Ship Date`,
    ship_mode = `Ship Mode`,
    customer_id = `Customer ID`,
    customer_name = `Customer Name`,
    segment = Segment,
    city = City,
    state = State,
    country = Country,
    postal_code = `Postal Code`,
    market = Market,
    region = Region,
    product_id = `Product ID`,
    category = Category,
    sub_category = `Sub-Category`,
    product_name = `Product Name`,
    sales = Sales,
    quantity = Quantity,
    discount = Discount,
    profit = Profit,
    shipping_cost = `Shipping Cost`,
    order_priority = `Order Priority`
  )
```
#### Adding new date columns based on weekday, day, month and year.
```{r}
# Derive calendar parts of the order date (labelled weekday, day of month,
# labelled month, year) for later filtering and grouping.
global <- global %>%
  mutate(
    weekday = wday(order_date, label = TRUE),
    day = day(order_date),
    month = month(order_date, label = TRUE),
    year = year(order_date)
  )
```

```{r}
# Show the table with the renamed columns and the new date-part columns.
global
```


### Deviation
#### Deviation can be used to emphasize variation from a fixed reference point such as 0, average or any target.
```{r}
# Tag every order as a gain or a loss relative to zero profit, keep only the
# columns the deviation plots use, and sort from lowest to highest profit.
diverging_profit <- global %>%
  mutate(gain_loss = if_else(profit < 0, "loss", "gain")) %>%
  select(order_date, region, profit, gain_loss) %>%
  arrange(profit)

# Horizontal bars, one per region, stacked from the individual order profits
# and coloured by gain/loss.
ggplot(diverging_profit, aes(x = region, y = profit, label = profit)) +
  geom_col(aes(fill = gain_loss), width = 0.8) +
  coord_flip() +
  scale_fill_manual(values = c("gain" = "#1F77B4", "loss" = "#FF7F0E")) +
  labs(
    title = "Diverging Bar",
    subtitle = "Profit loss or gain from a fixed reference point of 0"
  )

```


```{r}
# One semi-transparent point per order: profit on the (flipped) value axis,
# region on the category axis, colour showing gain or loss.
ggplot(diverging_profit, aes(x = region, y = profit, label = profit)) +
  geom_point(aes(colour = gain_loss), size = 2, alpha = 0.6) +
  coord_flip() +
  scale_colour_manual(values = c("gain" = "#1F77B4", "loss" = "#FF7F0E")) +
  labs(
    title = "Diverging Dot Plot",
    subtitle = "Profit loss or gain from a fixed reference point of 0"
  )

```


```{r}
# Orders placed before 1 June 2011, each tagged as a gain or a loss.
# NOTE: despite its name, `filter_2012` holds early-2011 orders; the name is
# kept unchanged so anything else referring to it keeps working.
filter_2012 <- global %>%
  # Same rule as `diverging_profit`: only strictly negative profit is a loss,
  # so a zero-profit order is classified identically in every deviation plot.
  mutate(gain_loss = if_else(profit < 0, "loss", "gain")) %>%
  # Compare calendar dates instead of a date-time against a bare string: the
  # string would be parsed in the session's local time zone, which shifts the
  # cutoff depending on where the notebook is run.
  filter(as_date(order_date) < ymd("2011-06-01"))

# Area chart of profit over time, filled by gain/loss.
ggplot(filter_2012, aes(x = order_date, y = profit, fill = gain_loss)) +
  geom_area() +
  scale_fill_manual(values = c("gain" = "#1F77B4", "loss" = "#FF7F0E")) +
  labs(
    title = "Diverging Area Chart",
    subtitle = "Profit loss or gain from a fixed reference point of 0"
  )
```


### Correlation
#### Correlation shows the relationship between two or more variables.
```{r}
# Sales against profit, points coloured by customer segment, with a single
# LOESS trend line (no confidence band) fitted across all segments.
ggplot(global, aes(x = sales, y = profit)) +
  geom_point(aes(colour = segment), alpha = 0.6) +
  geom_smooth(method = "loess", se = FALSE) +
  scale_colour_manual(values = c("#1F77B4", "#FF7F0E", "#2CA02C")) +
  labs(
    title = "Scatterplot with Smoothing Line Based on LOESS",
    subtitle = "Sales vs Profit"
  )

```


#### Loading fuel economy data from 1999 and 2008 for 38 popular models of car
```{r}
# Show the mpg fuel-economy dataset used by the next plots.
mpg
```


```{r}
 # Highway mileage against engine displacement, with a LOESS trend line
 # drawn without its confidence band.
 ggplot(mpg, aes(x = hwy, y = displ)) +
   geom_point(colour = "#FF7F0E") +
   geom_smooth(method = "loess", se = FALSE) +
   labs(
     title = "Scatterplot with Smoothing Line Based on LOESS",
     subtitle = "Engine Displacement vs Highway Miles Per Gallon"
   )
```


### Ranking
#### Ranking is useful where an ordered list is important.
```{r}
# Total sales per region, with `region` turned into a factor whose levels
# run from the highest-selling region to the lowest so the bars are ranked.
ranked_region <- global %>%
  group_by(region) %>%
  summarise(sum = sum(sales)) %>%
  mutate(region = factor(region, levels = region[order(desc(sum))]))

ggplot(ranked_region, aes(x = region, y = sum)) +
  geom_col(width = 0.8, fill = "#1F77B4") +
  labs(title = "Ranked Bar Chart", subtitle = "Sales by Region") +
  # Tilt the region names so they do not overlap.
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
  
```


```{r}
# One dot per region at its total sales, plus a thin dashed guide line per
# region spanning the smallest to the largest total; axes are flipped so the
# regions read top to bottom.
ggplot(ranked_region, aes(x = region, y = sum)) +
  geom_point(colour = "#1F77B4", size = 3, alpha = 0.9) +
  geom_segment(
    aes(x = region, xend = region, y = min(sum), yend = max(sum)),
    linetype = "dashed",
    size = 0.1
  ) +
  coord_flip() +
  labs(title = "Dot Plot Ranking Bar", subtitle = "Sales vs Region")
```

### Distribution
Distribution plots show how often values of a variable occur.
```{r}
# Histogram of highway mileage from the mpg dataset, split into 7 bins.
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(bins = 7, colour = "black", fill = "#1F77B4") +
  labs(
    title = "Histogram",
    subtitle = "Count of Highway Miles"
  )
```

#### Data munging
```{r}
# Treat the cylinder count as a categorical variable so it can drive fills
# and groups in the plots below, then show the data.
mtcars[["cyl"]] <- as.factor(mtcars[["cyl"]])
mtcars
```


```{r}
# Density of car weight, one filled curve per cylinder count, with the
# weight axis limited to 0-6.
ggplot(mtcars, aes(x = wt)) +
  geom_density(aes(fill = factor(cyl)), alpha = 0.7) +
  xlim(0, 6) +
  scale_fill_manual(values = c("#1F77B4", "#2CA02C", "#FF7F0E")) +
  labs(
    title = "Density plot",
    subtitle = "Weight (1000 lbs) per Cylinder",
    caption = "Source: mtcars",
    x = "Weight",
    fill = "Cylinders"
  )

```

```{r}
# Quarter-mile time by cylinder count; box width reflects group size.
ggplot(mtcars, aes(x = cyl, y = qsec, group = cyl)) +
  geom_boxplot(fill = "#1F77B4", varwidth = TRUE) +
  labs(
    title = "Boxplot",
    subtitle = "1/4 mile time per Cylinder",
    caption = "Source: mtcars",
    x = "Cylinders"
  )
```


### Composition
#### Composition graphs show how a single entity can be broken down into its component elements.
```{r}
# Number of order lines per sub-category, with each bar filled by the
# category it belongs to.
ggplot(global, aes(x = sub_category)) +
  geom_bar(aes(fill = category)) +
  scale_fill_manual(values = c("#1F77B4", "#FF7F0E", "#2CA02C")) +
  labs(
    title = "Composition",
    subtitle = "Counts of Sub-Category - Grouped by Category",
    caption = "Source: Superstore",
    x = "Sub-Category",
    fill = "Category"
  ) +
  # Tilt the sub-category names so they do not overlap.
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
  
```
```{r}
# Show the current state of the Superstore table.
global
```


Stacked Column
```{r}
# Number of order lines per market, stacked by product category.
ggplot(global, aes(x = market)) +
  geom_bar(aes(fill = category)) +
  scale_fill_manual(values = c("#1F77B4", "#FF7F0E", "#2CA02C")) +
  labs(
    title = "Category Stacked - Bar Chart",
    subtitle = "Markets - Grouped by Category",
    x = "Market",
    fill = "Category",
    caption = "Source: Superstore"
  ) +
  # Tilt the market names so they do not overlap.
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
```


```{r} 
# Treemap of the January 2012 orders: tile area is sales, tile colour is
# profit, and tiles are grouped by market.
library(treemapify)

global_treemap <- global %>%
  filter(year == 2012, month == "Jan") %>%
  mutate(
    country = as.factor(country),
    region = as.factor(region),
    market = as.factor(market)
  )

# Tile coordinates, kept under the original name for anything that inspects
# them. The `fill` and `group` arguments are deprecated in treemapify();
# grouping is now expressed with `subgroup`.
treeMapCoordinates <- treemapify(
  global_treemap,
  area = "sales",
  subgroup = "market"
)

# `ggplotify()` is no longer part of treemapify; treemaps are drawn with
# ggplot2 geoms, and `geom_treemap()` computes the layout itself.
treeMapPlot <- ggplot(
  global_treemap,
  aes(area = sales, fill = profit, subgroup = market)
) +
  geom_treemap() +
  geom_treemap_subgroup_border() +
  geom_treemap_subgroup_text(place = "centre", colour = "white", alpha = 0.7)
print(treeMapPlot)
```


### Change 
#### Change gives emphasis to changing trends.
```{r}
# Keep only the orders placed in 2012.
year_2012 <- filter(global, year == 2012)

# Sales over time, one panel per customer segment stacked in three rows.
ggplot(year_2012, aes(x = order_date, y = sales)) +
  geom_line(colour = "#1F77B4") +
  facet_wrap(~segment, nrow = 3) +
  labs(
    title = "Change Trends",
    subtitle = "Sales by segment",
    x = "Date",
    caption = "Source: Superstore"
  )
```

#### Heatmap - with heatmaps we can spot variations of a metric.
```{r}
library(superheat)

# `mtcars_matrix` was never created in this notebook. Build it from the
# pristine `datasets::mtcars`: the working copy of `mtcars` has `cyl`
# converted to a factor earlier on, which would make `as.matrix()` return a
# character matrix that cannot be drawn as a heatmap.
mtcars_matrix <- as.matrix(datasets::mtcars)

superheat(
  mtcars_matrix,
  # Centre and scale each column so variables measured in very different
  # units (e.g. disp vs. wt) share one meaningful colour scale.
  scale = TRUE,
  left.label.size = 0.3,
  left.label.text.size = 3,
  legend.text.size = 12,
  padding = 0.1
)
```


***
[Video - 23 Visualizations and when to use them](https://blog.dominodatalab.com/video-23-visualizations-use/)